# Load required libraries
require(mlr3learners) 
## Loading required package: mlr3learners
## Warning: package 'mlr3learners' was built under R version 4.4.1
## Loading required package: mlr3
## Warning: package 'mlr3' was built under R version 4.4.1
require(kknn)  
## Loading required package: kknn
require(class)
## Loading required package: class
require(ggplot2)
## Loading required package: ggplot2
library(mlr3)
library(mlr3viz)
## Warning: package 'mlr3viz' was built under R version 4.4.1
library(ggplot2)
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.1
library(sqldf)
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.4.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.4.1
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: <9A3F5E83-2A35-33C3-9C5A-5255B116A1BE> /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/modules/R_X11.so
##   Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## tcltk DLL is linked to '/opt/X11/lib/libX11.6.dylib'
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
library(codebookr)
library(codebook)
## Warning: package 'codebook' was built under R version 4.4.1
## 
## Attaching package: 'codebook'
## The following object is masked from 'package:codebookr':
## 
##     codebook
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
library(labelled)
## Warning: package 'labelled' was built under R version 4.4.1
## 
## Attaching package: 'labelled'
## The following object is masked from 'package:codebook':
## 
##     to_factor
library(readr)
library(tibble)

# Step 1: Read raw data files (coded values + labels)
# Every column is read as character on purpose: both exports carry extra
# header rows (dropped in Step 2) and the survey codes are handled as strings
# downstream, so the column types are stated explicitly instead of being left
# to readr's type guessing. This also silences the column-spec message.
# readr renames the three columns called `2.19` in the first file to
# `2.19...48`, `2.19...49` and `2.19...50`.
# NOTE(review): `data_values` is read from "kibera_labels.data.csv" and
# `data_labels` from "kibera_values_data.csv" -- the file names look swapped
# relative to the variable names; confirm which export holds the codes.
data_values <- read_csv(
  "~/Desktop/GRIT/Kibera/originalCsv/kibera_labels.data.csv",
  col_types = cols(.default = col_character())
)
data_labels <- read_csv(
  "~/Desktop/GRIT/Kibera/originalCsv/kibera_values_data.csv",
  col_types = cols(.default = col_character())
)
# Step 2: Extract variable labels and drop metadata/header rows
# The first row is flattened into a plain, unnamed character vector with one
# entry per column; Step 6 uses it positionally as the variable labels.
variable_labels <- as.character(unlist(data_values[1, ], use.names = FALSE))

# The first two rows of each table are not respondent data; remove them.
data_values <- data_values[-(1:2), ]
data_labels <- data_labels[-(1:2), ]

# Step 3: Rename variables that start with numbers
# Only the names beginning with a digit are touched: each gets an "x" prefix,
# every other name is left exactly as it is.
names(data_values)[grepl("^[0-9]", names(data_values))] <-
  paste0("x", grep("^[0-9]", names(data_values), value = TRUE))
names(data_labels)[grepl("^[0-9]", names(data_labels))] <-
  paste0("x", grep("^[0-9]", names(data_labels), value = TRUE))


# Columns present (under the same name) in both tables
common_cols <- intersect(names(data_values), names(data_labels))

# Per-variable lookup tables, keyed by column name
codebook_list <- list()

# Step 4: Build correct label-value mapping (Value = code, Label = text)
for (col in common_cols) {
  # Pair the two tables row by row for this column, as character strings
  pairs <- data.frame(
    Value = as.character(data_values[[col]]),
    Label = as.character(data_labels[[col]]),
    stringsAsFactors = FALSE
  )

  # Keep each complete (Value, Label) combination once
  pairs <- distinct(filter(pairs, !is.na(Value), !is.na(Label)))

  # Skip columns with nothing to map, or where both tables hold identical
  # text in every row (no separate coding to record)
  if (nrow(pairs) == 0 || all(pairs$Value == pairs$Label)) {
    next
  }

  pairs$Variable <- col
  codebook_list[[col]] <- pairs
}




# Step 5: Combine into a long-format codebook
# Stack the per-variable tables and put the columns in a fixed order:
# Variable, Value, Label.
codebook_df <- select(bind_rows(codebook_list), Variable, Value, Label)


# Step 7: Build nested dictionary for value labels
# Result: a list named by variable. Each element is a character vector whose
# names are the `Value` entries and whose elements are the `Label` entries.
label_dict <- deframe(
  summarise(
    group_by(codebook_df, Variable),
    mapping = list(setNames(Label, Value))
  )
)

# Step 8: Apply value labels automatically
# `label_dict[[var]]` is keyed the dictionary way round: names are the entries
# of `Value` (what is stored in `data_values`), elements are the label text.
# `val_labels<-` expects the opposite orientation -- a named vector whose
# VALUES are the data values and whose NAMES are the labels -- so the mapping
# is inverted before it is attached. Without the inversion the label text is
# registered as the "values" and never matches anything in the column.
for (var in names(label_dict)) {
  if (!var %in% names(data_values)) {
    next
  }

  code_to_label <- label_dict[[var]]
  value_labels <- setNames(names(code_to_label), unname(code_to_label))

  # Ensure variables are character so labels stick
  data_values[[var]] <- as.character(data_values[[var]])
  val_labels(data_values[[var]]) <- value_labels
}

# Step 6: Apply variable labels (question text) to data_values
# Labels are matched to columns by position: the n-th entry of
# `variable_labels` is attached to the n-th column.
for (position in seq_len(ncol(data_values))) {
  question_text <- variable_labels[position]
  var_label(data_values[[position]]) <- question_text
}

# Step 9: Save codebook CSV
# Writes the long-format Variable / Value / Label table to disk.
write_csv(
  x = codebook_df,
  file = "~/Desktop/GRIT/Kibera/codebook/auto_generated_codebook.csv"
)

# Step 10: Optional – Generate a codebook report (interactive viewer or RMarkdown)
# Only include non-free-text variables in the summary.
# A character column is excluded when it has more than 100 distinct values
# (treated as free text) or when it contains any missing value.
# vapply() pins the result to one logical per column, so the subsetting below
# stays well-defined even for a table with no columns.
too_unique_vars <- names(data_values)[
  vapply(
    data_values,
    function(x) {
      is.character(x) && (length(unique(x)) > 100 || any(is.na(x)))
    },
    logical(1)
  )
]

safe_vars <- setdiff(names(data_values), too_unique_vars)

# Both codebookr and codebook export a `codebook()` function; qualify the
# call so the result does not depend on the order the packages were attached.
codebook_output <- codebook::codebook(data_values[, safe_vars])
# Save the labelled data set (`data_values`), not the codebook report object
saveRDS(
  object = data_values,
  file = "~/Desktop/GRIT/Kibera/codebook/codebook_labelled_data.rds"
)